# Rmd setup: anchor knitr's working directory at the project root so that
# relative paths used below (e.g. graphs/descriptives) resolve correctly.
knitr::opts_knit$set(root.dir = "/Users/TARDIS/Documents/STUDIES/context_word_seg")
library(ProjectTemplate)
# load.project() runs the ProjectTemplate startup (config, data, munge,
# lib); it is what provides the df_* data frames and helper functions
# (extract_contexts, cloud_from_df, ...) used throughout this document.
load.project()

General descriptives

How many utterances in each context?

######
# WL #
######
# Word-list (WL) context assignments.
# expand_windows() can leave fractional values (e.g. 1.5) in the context
# columns; for the word-list analysis every context indicator is collapsed
# to a strict 0/1. across() replaces the long-deprecated mutate_each()/funs().
WL <- df_WL %>% 
  mutate(across(c(-utt, -orth, -phon), ~ ifelse(.x > 0, 1, 0)))

# Utterance count per context = column sums of the 0/1 indicators.
# Compute the sums once (the original computed the same colSums twice).
WL_counts <- colSums(dplyr::select(WL, -utt, -orth, -phon))
WL_sum <- data.frame(context = names(WL_counts), n = WL_counts) %>% 
  mutate(method = "word lists")
WL_sum %>% 
  select(-method) %>% 
  arrange(desc(n)) %>% 
  kable()
context n
body_touch 1530
meal 942
fussing 741
bath 664
bed 570
play 528
diaper_dressing 471
media 20
######
# HJ #
######
# Human/coder-judgment (HJ) context codes, already binarized upstream.
HJ <- df_HJ_bin
# Utterance count per context. The original computed the same column sums
# twice, once from HJ and once from df_HJ_bin; since HJ is assigned from
# df_HJ_bin they are identical — compute once from HJ for consistency.
HJ_counts <- colSums(dplyr::select(HJ, -utt, -orth, -phon), na.rm = TRUE)
HJ_sum <- data.frame(context = names(HJ_counts), n = HJ_counts) %>% 
  mutate(method = "coder judgments")
HJ_sum %>% 
  select(-method) %>% 
  arrange(desc(n)) %>% 
  kable()
context n
playtime 4789
interaction 4515
fussing 2739
mealtime 1849
bathtime 1215
none 914
diaper change 663
dressing 412
sleep 410
touching 270
housework 269
taking pictures 113
hiccups 85
outside 45
TV 43
sick 18
coughing 15
swimming 15
stretching 13
sneezing 10
music 6
drooling 4
blowing bubbles 2
friends 1
pre-doctors visit 1
######
# TM #
######
# Topic-model (LDA) context assignments, binarized upstream.
LDA <- df_LDA_bin
# Utterance count per topic; compute the column sums once instead of twice.
LDA_counts <- colSums(dplyr::select(LDA, -utt, -orth, -phon), na.rm = TRUE)
LDA_sum <- data.frame(context = names(LDA_counts), n = LDA_counts) %>% 
  mutate(method = "LDA")
LDA_sum %>% 
  select(-method) %>% 
  arrange(desc(n)) %>% 
  kable()
context n
topic_11 2292
topic_5 1887
topic_3 1616
topic_4 1544
topic_2 1478
topic_7 1251
topic_6 1187
topic_9 1176
topic_8 1090
topic_12 1020
topic_1 736
topic_10 613
# Structural topic model (STM) context assignments, binarized upstream.
STM <- df_STM_bin
# Utterance count per topic; compute the column sums once instead of twice.
STM_counts <- colSums(dplyr::select(STM, -utt, -orth, -phon), na.rm = TRUE)
STM_sum <- data.frame(context = names(STM_counts), n = STM_counts) %>% 
  mutate(method = "STM")
STM_sum %>% 
  select(-method) %>% 
  arrange(desc(n)) %>% 
  kable()
context n
topic_3 2174
topic_10 1938
topic_4 1775
topic_7 1733
topic_1 1575
topic_2 1392
topic_8 1066
topic_6 930
topic_9 845
topic_5 840
topic_12 548
topic_11 540
# Stack the per-method context counts for a single faceted plot.
all_sum <- rbind(WL_sum, HJ_sum, LDA_sum, STM_sum)

# One bar per context (ordered by count), one facet per method. The dashed
# horizontal line marks the total corpus size (nrow(df)), so bars can be
# read against the whole corpus.
ggplot(all_sum, aes(y=n, x=reorder(as.factor(context), n))) + 
  geom_bar(stat = 'identity', show.legend = FALSE) + 
  geom_hline(aes(yintercept=nrow(df)), lty=2) + 
  facet_wrap(~method, scales = "free_x", ncol=1) + 
  ggtitle("Number of utterances in each context\ndashed line shows total corpus size")

# Saves the most recent plot; path is relative to root.dir set in setup.
ggsave(filename ="utts_per_context.png" , path = "graphs/descriptives", width = 5, height = 4, units="in")

How many contexts per utterance?

######
# WL #
######
# Number of distinct contexts tagged on each utterance = row sums of the
# 0/1 context indicator columns.
WL$N.contexts <- rowSums(dplyr::select(WL, -utt, -orth, -phon))
table(WL$N.contexts)
## 
##    0    1    2    3 
## 8334 3924  693   52
# extract_contexts() is a project helper; judging from the summary below it
# collapses the indicator columns into a single `context` factor, with
# "ambiguous" for multi-tagged utterances and "no context tag" for
# untagged ones — TODO confirm against its definition in lib/.
WL <- extract_contexts(WL)
summary(WL$context) # the number of utterances in each context
##       ambiguous            bath             bed      body_touch 
##             745             440             377            1147 
## diaper_dressing         fussing            meal           media 
##             295             557             706              12 
##  no context tag            play 
##            8334             390
length(which(WL$N.contexts > 2)) / length(WL$N.contexts) # percent of corpus with > 2 contexts
## [1] 0.003999077
length(which(WL$N.contexts == 0)) / length(WL$N.contexts) # percent of corpus with 0 contexts
## [1] 0.640929
# Same rates restricted to utterances that received at least one tag:
length(which(WL$N.contexts > 2)) / length(which(WL$N.contexts > 0)) 
## [1] 0.01113729
length(which(WL$N.contexts > 1)) / length(which(WL$N.contexts > 0)) 
## [1] 0.1595631
######
# HJ #
######
# Contexts per utterance for coder judgments; na.rm = TRUE treats an NA
# indicator as "not tagged" when summing across context columns.
HJ$N.contexts <- rowSums(dplyr::select(HJ, -utt, -orth, -phon), na.rm = TRUE)
table(HJ$N.contexts)
## 
##    0    1    2    3 
##  550 6889 5165  399
# Collapse indicators to a single context label (same helper as for WL).
HJ <- extract_contexts(HJ)
summary(HJ$context) # the number of utterances in each context
##       ambiguous        bathtime   diaper change        dressing 
##            5564             864             289             148 
##         fussing         hiccups       housework     interaction 
##             807              17              95            1325 
##        mealtime  no context tag            none         outside 
##             800             550             169               6 
##        playtime            sick           sleep taking pictures 
##            2189               2             126              11 
##        touching              TV 
##              22              19
length(which(HJ$N.contexts > 2)) / length(HJ$N.contexts) # percent of corpus with > 2 contexts
## [1] 0.03068523
length(which(HJ$N.contexts == 0)) / length(HJ$N.contexts) # percent of corpus with 0 contexts
## [1] 0.04229793
######
# TM #
######
# Contexts (topics) per utterance for the LDA assignments.
LDA$N.contexts <- rowSums(dplyr::select(LDA, -utt, -orth, -phon), na.rm = TRUE)
table(LDA$N.contexts)
## 
##    0    1    2    3 
##   31 9100 3350   30
# Collapse topic indicators to a single context label (same helper as WL/HJ).
LDA <- extract_contexts(LDA)
summary(LDA$context) # the number of utterances in each context
##      ambiguous no context tag        topic_1       topic_10       topic_11 
##           3380             31            286            300           1440 
##       topic_12        topic_2        topic_3        topic_4        topic_5 
##            570            968            610           1065           1137 
##        topic_6        topic_7        topic_8        topic_9 
##            679            837            758            450
length(which(LDA$N.contexts > 2)) / length(LDA$N.contexts) # percent of corpus with > 2 contexts
## [1] 0.00239789
length(which(LDA$N.contexts == 0)) / length(LDA$N.contexts) # percent of corpus with 0 contexts
## [1] 0.00247782
# Contexts (topics) per utterance for the STM assignments.
STM$N.contexts <- rowSums(dplyr::select(STM, -utt, -orth, -phon), na.rm = TRUE)
table(STM$N.contexts)
## 
##    0    1    2    3 
##  213 9300 2938   60
STM <- extract_contexts(STM)
summary(STM$context) # the number of utterances in each context
##      ambiguous no context tag        topic_1       topic_10       topic_11 
##           2998            213            621            936            390 
##       topic_12        topic_2        topic_3        topic_4        topic_5 
##            428            750           1432           1178            540 
##        topic_6        topic_7        topic_8        topic_9 
##            540           1256            766            463
length(which(STM$N.contexts > 2)) / length(STM$N.contexts) # percent of corpus with > 2 contexts
## [1] 0.00479578
length(which(STM$N.contexts == 0)) / length(STM$N.contexts) # percent of corpus with 0 contexts
## [1] 0.01702502
###########
# OVERALL #
###########
# One tidy frame per method — utterance id, number of contexts, collapsed
# context label — plus a method label for stacking/faceting below.
wl_contexts <- dplyr::select(WL, utt, N.contexts, context) %>% 
  mutate(method = "word list")
hj_contexts <- dplyr::select(HJ, utt, N.contexts, context) %>% 
  mutate(method = "coder judgments")
lda_contexts <- dplyr::select(LDA, utt, N.contexts, context) %>% 
  mutate(method = "LDA")
stm_contexts <- dplyr::select(STM, utt, N.contexts, context) %>% 
  mutate(method = "STM")

# Utterance counts by method x number-of-contexts, for stacked bars.
all_methods_counts <- rbind(wl_contexts, hj_contexts, lda_contexts, stm_contexts) %>% 
  count(method, N.contexts) 

ggplot(all_methods_counts, aes(y=n, x=method, fill=as.factor(N.contexts))) + 
  geom_bar(stat = "identity") + 
  labs(title = "Number of contexts tagged per utterance", y="Number of utterances", x = "Context defined by") + 
  scale_fill_discrete(name="Number of contexts")

ggsave(filename ="contexts_per_utt_1.png" , path = "graphs/descriptives", width = 5, height = 4, units="in")

# Repeat with the HJ "none" category removed: drop the none column along
# with the stale N.contexts/context columns, then recompute both.
HJ_no_none <- dplyr::select(HJ, -none, -N.contexts, -context) 
HJ_no_none$N.contexts <- rowSums(dplyr::select(HJ_no_none, -utt, -orth, -phon), na.rm = TRUE)
HJ_no_none <- extract_contexts(HJ_no_none)
  
hj_no_none <- dplyr::select(HJ_no_none, utt, N.contexts, context) %>%
  mutate(method = "coder judgments")

all_methods_counts_no_none <- rbind(wl_contexts, hj_no_none, lda_contexts, stm_contexts) %>% 
  count(method, N.contexts) 

ggplot(all_methods_counts_no_none, aes(y=n, x=method, fill=as.factor(N.contexts))) + 
  geom_bar(stat = "identity") + 
  labs(title = "Number of contexts tagged per utterance,\nnot including 'none' for coder judgments", y="Number of utterances", x = "Context defined by") + 
  scale_fill_discrete(name="Number of contexts")

ggsave(filename ="contexts_per_utt_2.png" , path = "graphs/descriptives", width = 5, height = 4, units="in")

# Side-by-side check of HJ contexts-per-utterance with and without "none".
table(hj_contexts$N.contexts)
## 
##    0    1    2    3 
##  550 6889 5165  399
table(hj_no_none$N.contexts)
## 
##    0    1    2    3 
##  719 7357 4636  291
# Utterance counts by method x context label for the stacked context plot.
all_methods_context_count <- rbind(wl_contexts, hj_contexts, lda_contexts, stm_contexts) %>% 
  count(method, context)
# Move the three "non-context" levels to the front of the factor so the
# manual fill scale below colors exactly them and greys out real contexts.
all_methods_context_count$context <- relevel(all_methods_context_count$context, "none" )
all_methods_context_count$context <- relevel(all_methods_context_count$context, "no context tag" )
all_methods_context_count$context <- relevel(all_methods_context_count$context, "ambiguous" )

# First three colors = ambiguous / no context tag / none; grey for the rest.
ggplot(all_methods_context_count, aes(y=n, x=method, fill=context)) + 
  geom_bar(stat= "identity") + 
  scale_fill_manual(values = c("#0072B2", "#D55E00", "#E69F00", rep("#999999", length(levels(all_methods_context_count$context)) - 3)))

Printing context files for sharing with CF:

# Long table of utterance x method -> context label, splitting the
# utterance id "xx00.cha_N" into child code (2 letters), age (2 digits),
# and utterance number.
all <- rbind(wl_contexts, hj_no_none, lda_contexts, stm_contexts) %>% 
  dplyr::select(utt, context, method) %>% 
  tidyr::extract(utt, into = c("child", "age", "utt.num"), regex = "^([[:alpha:]]{2})([[:digit:]]{2})[.]cha_([[:digit:]]+)$")

# Numeric utterance numbers so the spread columns sort numerically
# rather than lexically ("2" before "10").
all$utt.num <- as.numeric(all$utt.num)
  
# One wide CSV per method: rows = child/age, one column per utterance
# number, cells = context label.
# NOTE(review): tidyr::spread() is superseded by pivot_wider(); kept as-is
# to preserve the column ordering of the previously shared files.
all %>% 
  dplyr::filter(method=="word list") %>% 
  tidyr::spread(key=utt.num, value=context) %>% 
  dplyr::select(-method) %>% 
  write.csv("/Users/TARDIS/Dropbox/2_RoseM_TP/context_files/contexts_file_WL.csv", row.names=FALSE)
all %>% 
  dplyr::filter(method=="coder judgments") %>% 
  tidyr::spread(key=utt.num, value=context) %>% 
  dplyr::select(-method) %>% 
  write.csv("/Users/TARDIS/Dropbox/2_RoseM_TP/context_files/contexts_file_HJ.csv", row.names=FALSE)  
all %>% 
  dplyr::filter(method=="LDA") %>% 
  tidyr::spread(key=utt.num, value=context) %>% 
  dplyr::select(-method) %>% 
  write.csv("/Users/TARDIS/Dropbox/2_RoseM_TP/context_files/contexts_file_LDA.csv", row.names=FALSE)
all %>% 
  dplyr::filter(method=="STM") %>% 
  tidyr::spread(key=utt.num, value=context) %>% 
  dplyr::select(-method) %>% 
  write.csv("/Users/TARDIS/Dropbox/2_RoseM_TP/context_files/contexts_file_STM.csv", row.names=FALSE)  

Select context files to use

Dropping “none” codes from HJ.

# Final choice of context assignments for downstream analyses: WL, LDA,
# and STM are used as computed above; HJ swaps in the version with the
# "none" category dropped. The self-assignments are deliberate no-ops
# that make the selection explicit in the knitted document.
WL <- WL
HJ <- HJ_no_none
LDA <- LDA 
STM <- STM 

General thoughts

The word list method can (and does) leave large portions of the corpus uncoded, unlike the coder judgment or topic modeling definitions of context. Coders were instructed to respond with “none” if they encountered a section of the corpus where they could not identify any activity context; there are 914 utterances in the corpus that are above threshold for the “none” code, indicating agreement across coders that those utterances do not have an identifiable activity context. This is a much smaller portion of the corpus (7%) than the uncoded section left by the word list method (64.1%), however.

The word list and coder definitions of context naturally produce a skewed distribution of activities, whereas the topic modeling approaches discover a more uniform distribution of activities. To the extent that the distribution of activities is naturally skewed (i.e., that a few activities happen very often, and many others happen rarely), topic modeling approaches to identifying activity context may distort reality.

Word list descriptives

Frequency of WL key words?

# Flatten the WL_contexts list (context name -> keyword vector) into a
# long frame with one row per (word, context) pair. Build the pieces in a
# list and bind once — never grow a data frame with rbind() inside a loop.
contexts <- names(WL_contexts)
WL_context_data <- do.call(rbind, lapply(contexts, function(k) {
  data.frame(word = WL_contexts[[k]], context = k, stringsAsFactors = FALSE)
}))

# Collapse all orthographic transcriptions into one space-separated stream.
orth_stream <- paste(df$orth, collapse = " ")
# flag bigrams from WL keywords in orth stream, so they don't get separated:
# an underscore keyword is searched for as its space-separated form and
# rewritten with underscores. fixed = TRUE matches the keyword literally,
# so regex metacharacters in a keyword cannot corrupt the substitution.
for(w_bar in grep(x = WL_context_data$word, pattern = "_", value = TRUE)){
  w_space <- gsub(x = w_bar, pattern = "_", replacement = " ", fixed = TRUE)
  orth_stream <- gsub(x = orth_stream, pattern = w_space, replacement = w_bar, fixed = TRUE)
}

# Tokenize on single spaces and drop empty tokens.
orth_stream <- strsplit(orth_stream, split = " ")[[1]]
orth_stream <- orth_stream[orth_stream != ""]

# Corpus frequency of every token.
orth_data <- data.frame(word = orth_stream, stringsAsFactors = FALSE) %>% 
  count(word)

# Attach corpus frequencies to the keywords; n is NA for keywords that
# never occur in the corpus (left join keeps every keyword row).
WL_context_data <- left_join(WL_context_data, orth_data, by = "word") %>% 
  arrange(context, n)

# Per-context summary: total and mean keyword frequency, and the single
# most frequent keyword in that context's list.
WL_context_data %>% 
  group_by(context) %>% 
  summarise(total=sum(n, na.rm=TRUE), 
            mean.freq=mean(n, na.rm=TRUE), 
            highest=max(n, na.rm=TRUE),
            which.highest=word[which.max(n)]) %>% 
  kable()
context total mean.freq highest which.highest
bath 287 15.105263 47 bath
bed 150 13.636364 43 tired
body_touch 550 20.370370 123 tickle
diaper_dressing 116 6.444444 50 nappie
fussing 216 19.636364 56 ssh
meal 303 8.416667 27 eat
media 4 2.000000 3 television
play 136 7.157895 65 play

HJ

What raw coder tags make up each category?

# All HJ context categories (every indicator column in df_HJ_bin).
HJ_contexts_list <- colnames(dplyr::select(df_HJ_bin, - utt, -orth, -phon))

# Reshape the processed coder codes to one row per raw tag with a 0/1
# column per category, renaming the raw-tag column to `orth` —
# presumably the column cloud_from_df() reads its words from; confirm
# against the helper's definition in lib/.
HJ_codes_df <- HJ_processedcodes %>% 
  dplyr::mutate(value=1) %>% 
  tidyr::spread(key=category, value=value, fill=0) %>% 
  dplyr::rename(orth=context)

# One cloud of raw coder tags per category (min.freq = 0 keeps every tag).
for(k in HJ_contexts_list){
  message(k)
  cloud_from_df(HJ_codes_df, which.context = k, min.freq = 0)
}
## bathtime
## blowing bubbles

## coughing

## diaper change

## dressing

## drooling

## friends

## fussing

## hiccups

## housework

## interaction

## mealtime

## music

## none

## outside

## playtime

## pre-doctors visit

## sick

## sleep

## sneezing

## stretching

## swimming

## taking pictures

## touching

## TV

What words are associated with each category?

# One word cloud per HJ category, built from the words of the utterances
# tagged with that category; the category name is echoed before each cloud.
for (ctx in HJ_contexts_list) {
  message(ctx)
  cloud_from_df(df_HJ_bin, ctx)
}
## bathtime
## blowing bubbles

## coughing

## diaper change

## dressing

## drooling

## friends

## fussing

## hiccups

## housework

## interaction

## mealtime

## music

## none

## outside

## playtime

## pre-doctors visit

## sick

## sleep

## sneezing

## stretching

## swimming

## taking pictures

## touching

## TV

Do “none” contexts from HJ match utterances with 0 context codes from WL?

# 1/0 indicator for "coders agreed this utterance has no context"
# (the nested ifelse keeps NA for anything that is neither 0 nor 1).
df_HJ_none <- df_HJ_bin
df_HJ_none$HJ_none <- ifelse(df_HJ_none$none==1, 1,
                             ifelse(df_HJ_none$none==0, 0, NA))
df_HJ_none <- dplyr::select(df_HJ_none, utt, HJ_none) 

# 1/0 indicator for "word-list method assigned zero contexts".
df_WL_0 <- WL
df_WL_0$WL_0 <- ifelse(df_WL_0$N.contexts == 0, 1,
                             ifelse(df_WL_0$N.contexts > 0, 0, NA))
df_WL_0 <- dplyr::select(df_WL_0, utt, WL_0) 

# Join on utterance id; match == 1 only when BOTH indicators are 1
# (sum of two 0/1 values equals 2); NA in either indicator propagates.
match <- full_join(df_HJ_none, df_WL_0, by="utt") %>% 
  mutate(match = HJ_none + WL_0)
match$match <- ifelse(match$match == 2, 1, 
                      ifelse(match$match < 2, 0, NA))
# Sanity check: the full join should neither add nor drop utterances.
nrow(df_WL_0); nrow(df_HJ_none); nrow(match)
## [1] 13003
## [1] 13003
## [1] 13003
# 2x2 cross-tabulation of the indicators (rows with NA are excluded,
# hence the table total of 12742 < 13003).
tab <- xtabs( ~ WL_0 + HJ_none, data = match)

addmargins(tab)
##      HJ_none
## WL_0      0     1   Sum
##   0    4405   178  4583
##   1    7423   736  8159
##   Sum 11828   914 12742
# Chi-squared test of independence between the two indicators.
summary(tab)
## Call: xtabs(formula = ~WL_0 + HJ_none, data = match)
## Number of cases in table: 12742 
## Number of factors: 2 
## Test for independence of all factors:
##  Chisq = 116.29, df = 1, p-value = 4.101e-27
# Mosaic and association plots of the same 2x2 table.
mosaic(tab)

assocplot(tab)

# Effect-size measures (phi, contingency coefficient, Cramer's V) —
# assocstats()/mosaic() presumably come from vcd via load.project().
assocstats(tab)
##                     X^2 df P(> X^2)
## Likelihood Ratio 127.25  1        0
## Pearson          116.29  1        0
## 
## Phi-Coefficient   : 0.096 
## Contingency Coeff.: 0.095 
## Cramer's V        : 0.096

What percent of the “none” context utterances in HJ method are 0 context in WL?

# Proportion of HJ "none" utterances that also got 0 contexts from WL:
# matched pairs / all HJ-none utterances.
sum(match$match, na.rm=TRUE) / sum(match$HJ_none, na.rm=TRUE)
## [1] 0.8052516

What percent of the 0 context utterances in WL method are “none” context in HJ?

# Proportion of WL zero-context utterances that HJ coded as "none":
# matched pairs / all WL-zero utterances.
sum(match$match, na.rm=TRUE) / sum(match$WL_0, na.rm=TRUE)
## [1] 0.08831293

Topic modeling: LDA

What words are associated with each LDA topic?

# Top ten highest-probability words per LDA topic, one column per topic.
top_words_lda <- as.data.frame(
  top.topic.words(lda$topics, num.words = 10),
  stringsAsFactors = FALSE
)
names(top_words_lda) <- paste0("topic_", seq_len(ncol(top_words_lda)))
kable(top_words_lda)
topic_1 topic_2 topic_3 topic_4 topic_5 topic_6 topic_7 topic_8 topic_9 topic_10 topic_11 topic_12
oh hey go yes hello yes oh ya oh oh mm oh
hey oh hey oh hmm oh come go go yes mummi yes
dear mummi want well oh tell dear come come ah yes tickl
hannah yes can boo dear go yes oh yes dear eh dear
alright hannah hmm hello got hey tell littl ya one girl go
trea hello come go gillian come good got eh clean hello come
ssh look nice say matter dear got yes bath two come got
yum hold see come smile look darl look alright wee oh big
darl smile smile dear look stori girl like get dirti good see
like shh littl clever hey daddi mum want like now alright good

LDAvis: http://bl.ocks.org/rosemm/raw/a7b1ac43ffe3b49229ed5e866762613f/

# http://cpsievert.github.io/LDAvis/reviews/reviews.html
# Prepare LDAvis inputs from the fitted lda-package model.
alpha <-  0.1 # from lda package demo
eta <-  0.1 # from lda package demo
# theta: per-document topic proportions; phi: per-topic word distributions.
# Smoothing constants are added before renormalizing each column to sum to 1.
theta <- t(apply(lda$document_sums + alpha, 2, function(x) x/sum(x)))
phi <- t(apply(t(lda$topics) + eta, 2, function(x) x/sum(x)))

D <- length(TM_doc_prep_out$documents)  # number of documents 
W <- length(TM_doc_prep_out$vocab)  # number of terms in the vocab
doc.length <- document.lengths(TM_doc_prep_out$documents)  # number of tokens per document
N <- sum(doc.length)  # total number of tokens in the data
term.frequency <- word.counts(TM_doc_prep_out$documents, vocab = TM_doc_prep_out$vocab)


# NOTE(review): lda_data duplicates the arguments passed to createJSON()
# below and is not used elsewhere in this document — possibly kept for
# interactive inspection; candidate for removal.
lda_data <- list(phi = phi,
                 theta = theta,
                 doc.length = doc.length,
                 vocab = TM_doc_prep_out$vocab,
                 term.frequency = as.integer(term.frequency))

# Build the LDAvis JSON payload from the model quantities above.
json <- createJSON(phi = phi, 
                   theta = theta, 
                   doc.length = doc.length, 
                   vocab = TM_doc_prep_out$vocab, 
                   term.frequency = as.integer(term.frequency))

# Publish the interactive LDAvis visualization as a GitHub gist.
serVis(json, as.gist = TRUE) 

Wordles, showing the frequencies of words in the utterances assigned to each topic.

# Word clouds of word frequencies in the utterances assigned to each of
# the 12 LDA topics, in topic order. (wordcloud may warn and drop words
# that cannot be fit on the page, as previously happened for topic_11.)
for (topic_name in paste0("topic_", 1:12)) {
  cloud_from_df(df_LDA_bin, topic_name)
}

Topic modeling: STM

What words are associated with each STM topic?

# Print the STM model summary: highest-probability words per topic, plus
# covariate-group words and topic-by-covariate interaction words (below).
summary(stm)
## A topic model with 12 topics, 427 documents and a 531 word dictionary.
## Topic Words:
##  Topic 1: side, lost, dear, round, ha, said, teddi 
##  Topic 2: bop, hello, monkey, give, hi, thing, say 
##  Topic 3: shh, hey, shake, rabbit, els, chou, light 
##  Topic 4: door, lunch, minut, sit, lucki, sweetheart, watch 
##  Topic 5: tickl, toe, feet, din, tick, tum, boy 
##  Topic 6: away, one, nappi, will, wee, sorri, milk 
##  Topic 7: hair, water, bad, fun, minut, deari, drink 
##  Topic 8: bum, bath, splash, swim, shake, dad, mum 
##  Topic 9: boo, bye, ah, clever, hide, see, tea 
##  Topic 10: hand, see, naughti, fed, can, bite, look 
##  Topic 11: juic, whether, duck, quack, miss, leav, madam 
##  Topic 12: dear, got, oh, hello, shame 
##  
##  Covariate Words:
##  Group cr: grumpi, gut, gorgeous, hmm, full, fast, shall 
##  Group gi: jumper, throat, without, keep, christoph, bash, reach 
##  Group gl: precious, handi, snooz, temper, mat, slept, happi 
##  Group la: hannah, busi, usual, nosh, prove, mummi, whose 
##  Group st: mobil, oven, cradl, appl, pussycat, cupsi, shoulder 
##  
##  Topic-Covariate Interactions:
##  Topic 1, Group cr: pretti, forget, nail, whee, let, dress, tear 
##  Topic 1, Group gi: clap, hand, hiccough, might, hah, christoph, left 
##  Topic 1, Group gl: struggl, walk, lost, aw, danc, punch, chin 
##  Topic 1, Group la: dub, bib, dear, bath, someth, hurt, whee 
##  Topic 1, Group st: hmm, stuck, okay, push, myron, kick, camera 
##  
##  Topic 2, Group cr: ticki, shake, excit, bore, gillian, chou, camera 
##  Topic 2, Group gi: oop, us, stuff, bash, christoph, hello, friend 
##  Topic 2, Group gl: hmm 
##  Topic 2, Group la: pretti, skirt, seen, sunshin, kick, hello, bop 
##  Topic 2, Group st: foot, attent, grab, must, nose, matter, play 
##  
##  Topic 3, Group cr: shush, matter, pleas, dolli, gillian, ah, hey 
##  Topic 3, Group gi: littl, re, wind, kick, shake, lad, chou 
##  Topic 3, Group gl: bop, rest, talk, oop, tri, bore, bib 
##  Topic 3, Group la: polli, near, thank, oop, bubbl, treasur, ho 
##  Topic 3, Group st: food, swing, smile, myron, suppos, move, ssh 
##  
##  Topic 4, Group cr: girl, love, nose, bubbl, littl, bless 
##  Topic 4, Group gi: gonna, lad, ticklish, clean, never, mum, hous 
##  Topic 4, Group gl: first, know, ssh, stuff, wet, will, happen 
##  Topic 4, Group la: iron, arm, downstair, cuddl, god, mm, quiet 
##  Topic 4, Group st: daddi, goin, readi, burp, wind, ya, wash 
##  
##  Topic 5, Group cr: belli, soft, wee, stori, din, nose, shout 
##  Topic 5, Group gi: clever, parrot, yum, complain, sack, daisi, terribl 
##  Topic 5, Group gl: find, tum, old, hah, dinner, thumb, lie 
##  Topic 5, Group la: excit, three, beebo, two, tum, get, one 
##  Topic 5, Group st: feet, toe, boy 
##  
##  Topic 6, Group cr: sleepi, huh, head, shout, dolli, eye, chair 
##  Topic 6, Group gi: burp, suck, vest, carri, roll, bib, finger 
##  Topic 6, Group gl: hide, astra, three, ah, daisi, hiccup, chang 
##  Topic 6, Group la: sunshin, clean, wee, straight, nappi, morn, one 
##  Topic 6, Group st: charl, bib, ever, gonna, forget, tri, whee 
##  
##  Topic 7, Group cr: gonna, tell, cross, patch, shout, madam, stori 
##  Topic 7, Group gi: fatti, relax, grab, ow, walk, lift, dad 
##  Topic 7, Group gl: petal, tale, pop, pet, bath, wash, dirti 
##  Topic 7, Group la: dub, stori, tell, daddi, rub, mum, wash 
##  Topic 7, Group st: lulu, hot, bottl, wide, juic, garden, first 
##  
##  Topic 8, Group cr: bath, bum 
##  Topic 8, Group gi: arm, struggl, naughti, stick, nail, wash, let 
##  Topic 8, Group gl: stori, wee, bubbl, next, ha, aw, mess 
##  Topic 8, Group la: will, splish, splosh, tonight, whee, air, mm 
##  Topic 8, Group st: say, anyth, cuddl, myron, shake, milk, hello 
##  
##  Topic 9, Group cr: oop, cross, patch, shush, sack, re, babi 
##  Topic 9, Group gi: parrot, everyth, best, mouth, cup, teeth, hair 
##  Topic 9, Group gl: astra, myron, anoth, tomorrow, yeah, clean, one 
##  Topic 9, Group la: lambchop, sleepi, watch, chou, love, girl, tri 
##  Topic 9, Group st: hide, stand, ah, littl, bye 
##  
##  Topic 10, Group cr: toe, stretch, game, play, can, tongu, goin 
##  Topic 10, Group gi: clap, splash, shout, push, stand, soon, ya 
##  Topic 10, Group gl: pretti, pram, can, bear, afternoon, play, tell 
##  Topic 10, Group la: lambchop, chou, friend, littl, smile, hmm, daddi 
##  Topic 10, Group st: polli, lewi, pastri, joseph, rain, though, thank 
##  
##  Topic 11, Group cr: attent, ya, bib, matter, think, minut, alright 
##  Topic 11, Group gi: duck, quack, bath, will, look, left, minut 
##  Topic 11, Group gl: get, sunshin, dress, tongu, arm, stori, never 
##  Topic 11, Group la: paddi, donald, yum, quack, came, point, silli 
##  Topic 11, Group st: meringu, dub, pie, lemon, rub, joseph, cupboard 
##  
##  Topic 12, Group cr: shame, burp, fatti, hicki, mind, poor, never 
##  Topic 12, Group gi: nose, stretch, daddi, size, christoph 
##  Topic 12, Group gl: smile, daddi, dad, bless, tis, stay, girli 
##  Topic 12, Group la: ssh, trea, pie, alright, armi, darl, funni 
##  Topic 12, Group st:  
## 
# http://cpsievert.github.io/LDAvis/reviews/reviews.html
# Interactive LDAvis view of the STM topics, published as a GitHub gist.
toLDAvis(stm, TM_doc_prep_out$documents, as.gist = TRUE) # This function does not yet allow content covariates.

Wordles, showing the probability of each word given the topic.

# STM word clouds, one per topic in order 1..12, sized by P(word | topic).
for (topic_i in 1:12) {
  cloud(stm, topic = topic_i)
}

Wordles, showing the frequencies of words in the utterances assigned to each topic.

# Word clouds of raw word frequencies in the utterances assigned to each
# of the 12 STM topics, in topic order.
for (topic_name in paste0("topic_", 1:12)) {
  cloud_from_df(df_STM_bin, topic_name)
}